/*****************************************************************************************************************************************************/
/*  Project: Associations between risk factors and AD                                                                                                */
/*  Programmer: Longgang Zhao (lz7@email.sc.edu)                                                                *    ****                            */
/*  Date: May 16, 2002, version 1                                                                               *      *                             */
/*  Supervisor: Anwar T Merchant                                                                                *     *                              */
/*  Dataset: NHANES III (https://wwwn.cdc.gov/nchs/nhanes/nhanes3/default.aspx)                                 **** ****                            */
/*  Purpose 1: Define risk factor and find clusters uisng ML                                                                                         */
/*  Purpose 2:                                                                                                                                       */
/*  Purpose 3:                                                                                                                                       */
/*****************************************************************************************************************************************************/

/*------------------------------------------------------------------------------------------------------------*/
/**********************************Part I: macro and formats***************************************************/
/*------------------------------------------------------------------------------------------------------------*/
%macro getdetails(dataset,number=20);
options ls=max; proc contents data = &dataset varnum; run; proc print data = &dataset (obs = &number); run;
%mend;

%macro getlevels(dataset, var);
title "&dataset and &var"; proc sql; select count(distinct &var) as Levels, count(*) as Nobs from &dataset; quit; title;
%mend;

%macro histogram(dataset, var, format=5.2);
ods listing close;
title "&var";
ods select histogram;
proc univariate data = &dataset noprint; 
var &var;
histogram &var/vscale=count;
inset n='Sample size'(5.0) nmiss='Missing'(5.0) mean='Mean'(&format) std='Std Dev'(&format) median='Median'(&format) min='Min'(&format) max='Max'(&format);
format &var &format;
run;
title;
ods listing;
%mend;
%macro con(datain=,conlist=);
proc means min median max mean std n nmiss data = &datain; var &conlist; run;
%mend;
%macro cat(datain=,catlist=);
proc freq data = &datain; tables &catlist/norow nocol nopercent missing; run;
%mend;
libname dataloc "C:\Users\lz7\OneDrive - University of South Carolina\AD risk factors\Summer GA 2022\Data";
proc format /*library=dataloc*/;
	value Agegp 0='<40' 1='40-<65' 2='>=65';
	value racegp 1='white' 2='black' 3='OTHER';
	value sex 1='Male' 2='Female';
	value edugp 1='1_<12 yrs' 2='>=12 yrs';
	value incomegp  1='LOW_PIR=<1.3' 2='MIDDLE_PIR=<3.5' 3='HIGH_PIR>3.5';
	value smkgp 1='NonSmoker' 2='Ever Smoker' 3='Current Smoker';
	value drinkgp 0='NonDrinker' 1='Drinker';
	value bmigp 1='NORMAL' 2='OVERWEIHGT' 3='OBESITY';
	value rpagp 1='Vigorously active' 2='Moderately active' 3='Sedentary'; 
	value yesno 1='YES' 0='NO';
	value yvisit 1='YES' 2='NO';
	value perios 1='Moderate or severe' 0='No disease' ;
run;
/*options fmtsearch=(dataloc);*/
/*------------------------------------------------------------------------------------------------------------*/
/**********************************Part II: Readin datasets ***************************************************/
/*------------------------------------------------------------------------------------------------------------*/
/*File location*/
%let masterfile=C:\Users\lz7\OneDrive - University of South Carolina\AD risk factors\Summer GA 2022\Data;
/*Read in dataset*/
%let data1A = &masterfile.\1A.July1997;
%let data2A = &masterfile.\2A.April1998;
%let data6A = &masterfile.\6A.Jan2000;
%let death  = &masterfile.\mortalitydata;
/*Read in adult*/
%include "&masterfile.\1A.July1997\adult.sas";
/*Read in exam*/
%include "&masterfile.\1A.July1997\exam.sas";
/*Read in lab*/
%include "&masterfile.\1A.July1997\lab.sas";
/*Read in adult*/
%include "&masterfile.\2A.April1998\adultx.sas";
/*Merge two adults file into one: adultall*/
%include "&masterfile.\ADULTMRG.sas";
/*Read in death file*/
%include "&death.\mortality.sas";
/*Read in spsdeppx data*/
libname spsde "&masterfile.\spsdeppx data";
data spsdeppx; set spsde.spsdeppx; run;
/*Read in HEI*/
%include "&masterfile.\6A.Jan2000\hei.sas";
/*Read in spsdeppx data*/

/*description*/
%getdetails(spsdeppx);
%getlevels(spsdeppx,SEQN); /*sample size #8,153*/ /*master file*/

%getdetails(Adultall);
%getlevels(Adultall,SEQN); /*sample size #20,050*/
%histogram(Adultall,HSAGEIR);

%getdetails(Exam);
%getlevels(Exam,SEQN); /*sample size #31,311*/

%getdetails(Lab);
%getlevels(Lab,SEQN); /*sample size #29,314*/

%getdetails(Nhanes_iii_mort);
%getlevels(Nhanes_iii_mort,SEQN); /*sample size #33,994*/

%getdetails(HEI);
%getlevels(HEI,SEQN); /*sample size #26,350*/

proc sort data=spsdeppx;        by SEQN; run;
proc sort data=Adultall;        by SEQN; run;
proc sort data=Exam;            by SEQN; run;
proc sort data=Lab;             by SEQN; run;
proc sort data=HEI;             by SEQN; run;
proc sort data=Nhanes_iii_mort; by SEQN; run;
data allinone; merge spsdeppx(in=a) Adultall(in=b) Exam(in=c) Lab(in=d) Nhanes_iii_mort(in=e) HEI(in=f); by SEQN;
if a and b and c and d and e and f;
run;
%getlevels(allinone,SEQN); /*sample size #7869*/

%histogram(allinone, HSAGEIR, format=5.2);
%histogram(allinone, HEISCORE, format=5.2);
proc freq data=allinone;
tables HAV1S HAV2S HAV3S HAV4S HAV5 HAV6S
       HAX9DG HAX9DH HAX9DI HAX9DJ HAX9DK HAX9DL HAX9DM HAX9DN HAX9DO HAX9DP HAX9DQ HAX9DR HAX9DS HAX9DT HAX9DU HAX9DV
       HAX9EG HAX9EH HAX9EI HAX9EJ HAX9EK HAX9EL HAX9EM HAX9EN HAX9EO HAX9EP HAX9EQ HAX9ER HAX9ES HAX9ET HAX9EU HAX9EV
       HAX9FG HAX9FH HAX9FI HAX9FJ HAX9FK HAX9FL HAX9FM HAX9FN HAX9FO HAX9FP HAX9FQ HAX9FR HAX9FS HAX9FT HAX9FU HAX9FV
       HAX11AG HAX11AH HAX11AI HAX11AJ HAX11AK HAX11AL HAX11AM HAX11AN HAX11AO HAX11AP HAX11AQ HAX11AR HAX11AS HAX11AT HAX11AU HAX11AV
       MQPDGPSW MQPDYSFR MQPDPSX1 MQPDEP
       MQPG05L MQPG06L HFA12;
run;

proc freq data=Exam;
tables MQPDGPSW MQPDYSFR MQPDPSX1 MQPDEP
       MQPG05L MQPG06L;
where HSAGEIR>=40;
run;
       
/*------------------------------------------------------------------------------------------------------------*/
/**********************************Part III: Covariates coding*************************************************/
/*------------------------------------------------------------------------------------------------------------*/
/**************************************************************
 MAKING 4 CLUSTERS copied from Dr. Merchant
Step 1: log transformed
Step 2: converted to standardized z-scores
variables begining with "m" mean values of 2 subgingival samples untransformed */
data allinone; 
 set  allinone;
	array micros DEPAAMX DEPPGMX DEPTD DEPTF DEPVP DEPAN DEPCR DEPEC DEPFN DEPMM DEPPI DEPSO DEPPN DEPEN DEPSI DEPCO DEPPM DEPSN DEPSM;
	do over micros; 
		if micros=0 then micros=0.0001;
	end;
run;

%let micros = DEPAAMX DEPPGMX DEPTD DEPTF DEPVP DEPAN DEPCR DEPEC DEPFN DEPMM DEPPI DEPSO DEPPN DEPEN DEPSI DEPCO DEPPM DEPSN DEPSM;

/* log transformed variables start with "lm" */
%macro logtran;
%let i = 1;
	data allinone;
		set allinone;
		%do %while (%scan(&micros.,&i.)^=%str());
			%let thevar=%scan(&micros, &i);
			l&thevar=log(&thevar);

			%let i = %eval(&i.+1);
		%end;
	run;
%mend;

%logtran;

proc standard data=allinone mean=0 std=1 replace out=allinone; var LDEP:; run;


data allinone; /* standardized variables start with "slm" */
	set allinone;
	Red_Green= (LDEPAAMX + LDEPSN+ LDEPEC+ LDEPVP + LDEPCR + LDEPTD  + LDEPTF);
	Orange_Red=(LDEPPI +LDEPPGMX + LDEPPN +LDEPPM);	
	Yellow_Orange = (LDEPFN+ LDEPsI + LDEPMM + LDEPSO+ + LDEPCO+ LDEPSM);
	Orange_Blue = (LDEPAN+ LDEPEN);

	total_clusters =(Red_Green+ Orange_Red + Yellow_Orange + Orange_Blue);

	etiologic= (LDEPAAMX + LDEPPGMX  + LDEPTD  + LDEPTF);/*  slmA__actinomycetemcomitans + slmP__gingivalis + slmT__denticola + slmT__forsythia);*/
	health=( LDEPVP + LDEPAN);	*(slmV__parvula + slmA_naeslundii);
	putative = (LDEPFN+ +  LDEPsI + LDEPMM+ LDEPCR + LDEPEC);* (slmF__nucleatum_nucleatum + slmF__nucleatum_polymorphum + slmF__nucleatum_vincentii + slmS__intermedia + slmP__micra+ slmC__rectus + slmE__corrodens_);

	total_slm =(etiologic + health+ putative);

	if total_slm=0 then Orange_Red=.;
	if total_slm=0 then Red_Green=.;
	if total_slm=0 then Yellow_Orange=.;
	if total_slm=0 then Orange_Blue=.;
	if total_slm=0 then total_clusters=.;
	if total_slm=0 then etiologic=.;
	if total_slm=0 then health=.;
	if total_slm=0 then putative=.;
	if total_slm=0 then total_slm=.;
run;

/*%getdetails(allinone);*/
proc means data=allinone n nmiss min median max; var HSAGEIR dmppir BMPBMI BMPWAIST PEPMNK1R PEPMNK5R WCP GBP CRP TCP LCP HDP TGP LPP; run;

proc freq data=allinone; 
 tables HAA4 HAA5 HAA6A HAA6B HAA6C HAA6D HAP17A1 HAP17A2 HAP17A3 HAP19A HAP19B HAP19C HAP18A HAP18B HAP18C HAP18D HAP18E;
run;


data allinone; set allinone;
/*=================Morality coding======================*/
/*ALL-CAUSE MORTALITY;*/
 if UCOD_LEADING='' then ALL_DEATH=0; else ALL_DEATH=1; 
 UCOD_LEADING=input(UCOD_LEADING,3.0); 
/*CVD MORTALITY; */
 if  UCOD_LEADING =001 or UCOD_LEADING =005 then CVD_DEATH=1; else CVD_DEATH=0;
/*CANCER MORTALITY;*/
 if  UCOD_LEADING =002 then CA_DEATH=1;	else CA_DEATH=0; 
/*Alzheimer MORTALITY;*/
 if  UCOD_LEADING =006 then ALZ_DEATH=1; else ALZ_DEATH=0; 
/*DIABETES MORTALITY*/
 if  UCOD_LEADING =007 or DIABETES =1 then DB_DEATH=1; else DB_DEATH=0;

 if CVD_DEATH=0 then status_CVD=0; else status_CVD=1;
 if ALL_DEATH=0 then status_All=0; else status_All=1;
 if CA_DEATH=0  then status_CA=0;  else status_CA=1;
 if ALZ_DEATH=0 then status_ALZ=0; else status_ALZ=1;

/*=================Covariates coding======================*/
/*Marital status: HFA12
01 Married -spouse in household
02 Married -spouse not in household
03 Living as married
04 Widowed
05 Divorced
06 Separated
07 Never married
88 Blank but applicable
99 Don't know
*/
/*Age group*/
  Age=HSAGEIR;
if HSAGEIR=. then Agegp=.;
if HSAGEU=1 then do;
 if HSAGEIR<480      then Agegp=0;
  else if HSAGEIR<780 then Agegp=1; 
   else                     Agegp=2; 
 end;
if HSAGEU=2 then do;
 if HSAGEIR<40      then Agegp=0;
  else if HSAGEIR<65 then Agegp=1; 
   else                    Agegp=2; 
 end;
/*Race*/
if DMARACER =1 then racegp=1;
 else if DMARACER=2 then racegp=2;
  else if DMARACER=3 or DMARACER = 8 then racegp=3;
	else racegp=.;
/*Sex*/
  sex = hssex;

/*Education*/
 education=HFA8R;
if HFA8R in (.,88,99) then education=.;;
 if 0<=HFA8R<12 then edugp=1;
  else if HFA8R >=12 then edugp=2;
   else  edugp=.;

/*Income: PIR*/
if dmppir in (.,888888) then dmppir=.;
if 0=< dmppir <=1.3   then incomegp=1; 
 else if dmppir <=3.5  then incomegp=2; 
  else if dmppir >3.5   then incomegp=3; 
   else                       incomegp=.;

/*Smoking*/
if HAR1=2                 then smkgp=1;
 else if HAR1=1 AND HAR3=2 then smkgp=2;
  else if HAR1=1 AND HAR3=1 then smkgp=3;
	else                          smkgp=.;

/*Drinking*/
if HAN6HS in  (888,999) then HAN6HS=.;
if HAN6IS in  (888,999) then HAN6IS=.;
if HAN6JS in  (888,999) then HAN6JS=.;
drink_wk= Sum(HAN6HS,HAN6IS,HAN6JS)*(1/4.3);/* the conversion factors: 30.4 days/month and 4.3 weeks/month*/
if drink_wk =0      then drinkgp=0;
 else if drink_wk >0 then drinkgp=1; 
  else                     drinkgp=.;

/*BMI*/
if BMPBMI in(.,8888) then BMPBMI = .;
if BMPBMI <10 then BMPBMI=.;
bmi=BMPBMI;
if BMPBMI = .         then bmigp = .;
 else if BMPBMI<25     then bmigp = 1;
  else if 25<=BMPBMI<30 then bmigp = 2;
   else if BMPBMI>=30    then bmigp = 3;

/*Waist circumference ---bmpwaist;*/
if BMPWAIST   IN (8888, 888,88888 ) then BMPWAIST =.;
WC = BMPWAIST;
 
/*Physical activity*/
array hat {*}  HAT2 HAT4 HAT6 HAT8 HAT10 HAT12 HAT14 HAT16 HAT18;
do i=1 to dim(hat);
 if hat{i} in (8) then hat{i}=.;
end; drop i;
if HAT19MET in (8888,9998,9999) then HAT19MET = .;

array freq {*} HAT1S HAT3S HAT5S HAT7S HAT9S HAT11S HAT13S HAT15S HAT17S HAT20S HAT22S HAT24S HAT26S;
array activity {*} ACTIVITY1 ACTIVITY2 ACTIVITY3 ACTIVITY4 ACTIVITY5 ACTIVITY6 ACTIVITY7 ACTIVITY8 ACTIVITY9 ACTIVITY10 ACTIVITY11 ACTIVITY12 ACTIVITY13;
do i= 1 to dim(freq);
 if freq{i} in (8888,9998,9999) then freq{i}=.; activity{i}=freq{i}/4.3;
end; drop i;

if ACTIVITY1=. AND ACTIVITY2=. AND ACTIVITY3=. AND ACTIVITY4=. AND ACTIVITY5=. AND ACTIVITY6=. AND ACTIVITY7=. AND 
		ACTIVITY8=. AND ACTIVITY9=. AND ACTIVITY10=. AND ACTIVITY11=. AND ACTIVITY12=. AND ACTIVITY13=. AND 
		HAT2=. AND HAT4=. AND HAT6=. AND HAT8=. AND HAT10=. AND HAT12=. AND HAT14=. AND HAT16=. AND HAT18=. AND 
		HAT19MET = . AND HAT21MET = . AND HAT23MET = . AND HAT25MET = . then rpagp=.;
 else if ACTIVITY1 >= 5 OR ACTIVITY2 >= 3 OR ACTIVITY3 >=5 OR ACTIVITY4 >= 3  OR ACTIVITY5 >= 3 OR 
			ACTIVITY6 >= 5  OR ACTIVITY7 >= 5 OR ACTIVITY8 >= 5 OR ACTIVITY9 >= 5 then rpagp=3;
 else if (HAT19MET>=6 AND ACTIVITY10 >=3) OR (3=<HAT19MET<6 AND ACTIVITY10 >=5) OR 
			(HAT21MET>=6 AND ACTIVITY11 >=3) OR (3=<HAT21MET<6 AND ACTIVITY11 >=5) OR
			(HAT23MET>=6 AND ACTIVITY12 >=3) OR (3=<HAT23MET<6 AND ACTIVITY12 >=5) OR
			(HAT25MET>=6 AND ACTIVITY13 >=3) OR (3=<HAT25MET<6 AND ACTIVITY13 >=5)  then rpagp=3;
 else if ACTIVITY1=0 AND HAT2=2 AND HAT4=2 AND HAT6=2 AND HAT8=2 AND HAT10=2 AND HAT12=2 AND 
			HAT14=2 AND HAT16=2 AND HAT18=2 then rpagp=1;
 else rpagp=2;
/*Dietary quality*/
HEI=HEISCORE;
/*=================Outcome coding======================*/
/*Social connectedness*/
scl_phone    = HAV1S; if scl_phone    in (5555,8888,9998) then scl_phone    = .;
scl_together = HAV2S; if scl_together in (88888,99999)    then scl_together = .;
scl_neighb   = HAV3S; if scl_neighb   in (88888,99999)    then scl_neighb   = .;
scl_church   = HAV4S; if scl_church   in (8888,9999)      then scl_church   = .;
scl_clubs    = HAV5;  if scl_clubs    in (8)              then scl_clubs    = .;
scl_meeting  = HAV6S; if scl_meeting  in (8888,9998)      then scl_meeting  = .;
/*Anxiety and depression*/
array drug {*} HAX9DG HAX9DH HAX9DI HAX9DJ HAX9DK HAX9DL HAX9DM HAX9DN HAX9DO HAX9DP HAX9DQ HAX9DR HAX9DS HAX9DT HAX9DU HAX9DV
       HAX9EG HAX9EH HAX9EI HAX9EJ HAX9EK HAX9EL HAX9EM HAX9EN HAX9EO HAX9EP HAX9EQ HAX9ER HAX9ES HAX9ET HAX9EU HAX9EV
       HAX9FG HAX9FH HAX9FI HAX9FJ HAX9FK HAX9FL HAX9FM HAX9FN HAX9FO HAX9FP HAX9FQ HAX9FR HAX9FS HAX9FT HAX9FU HAX9FV;
drug_anxiety=0;
drug_depression=0;
drug_dm=0;
drug_cvd=0;
drug_hp=0;
drug_ca=0;
drug_hc=0;
do i=1 to dim(drug);
 if drug{i} in ("0627") then    drug_anxiety=1;
 if drug{i} in ("0630") then drug_depression=1;
 if drug{i} in ("7000") then drug_dm=1;
 if drug{i} in ("0512","3053","0642","0643","1138","1139","1458","1647","3069") then drug_cvd=1;
 if drug{i} in ("3072","1069","3056","3014","1886","2425") then drug_hp=1;
 if drug{i} in ("0930","2109","0974","0976","0978","0979","0980") then drug_ca=1;
 if drug{i} in ("1664","3064","3088","1876","0808","0912") then drug_hc=1;
end; drop i;

array ICD {*} HAX11AG HAX11AH HAX11AI HAX11AJ HAX11AK HAX11AL HAX11AM HAX11AN HAX11AO HAX11AP HAX11AQ HAX11AR 
              HAX11AS HAX11AT HAX11AU HAX11AV;
ICD_anxiety=0;
ICD_depression=0;
ICD_dm=0;
ICD_cvd=0;
ICD_hp=0;
ICD_ca=0;
ICD_hc=0;
do i=1 to dim(ICD);
 if ICD{i} in ("300.0") then ICD_anxiety=1;
 if ICD{i} in ("311") then ICD_depression=1;
 if ICD{i} in ("250.0","250.9") then ICD_dm=1;
 if ICD{i} in ("391.9","401.9","410.9","413.9","414.0","416.0","423.9","424.0","427.0","427.1","427.31","427.41",
               "427.69","427.9","428.0","428.9","429.2","429.8","429.9","435.9","436","437.7","437.9","440.2","440.9",
               "443.9","444.22","444.9","446.5","447.9","451.2","451.9","453.9","455.6","458.9","459.0","459.9") then ICD_cvd=1; /*Diseases Of The Circulatory System 390-459*/
 if ICD{i} in ("401.9") then ICD_hp=1;
 if ICD{i} in ("149.0","153.3","173.4","173.9","174.9","185","189.0","193","199.1","208.9","238.2","239.6","239.9") then ICD_ca=1; /*Neoplasms 140-239*/
 if ICD{i} in ("272.0") then ICD_hc=1;
end; drop i;
anxiety=0;    if drug_anxiety=1    or ICD_anxiety=1    then anxiety=1;
depression=0; if drug_depression=1 or ICD_depression=1 then depression=1;
/*Appetite-(MQPG05L,MQPG06L) only available for adults<40*/

/*Cognition*/
/*1-17 scale for 6 orientation,6 reacall and 5 attention*/
array HAA {*} HAA4 HAA5 HAA6A HAA6B HAA6C HAA6D HAP17A1 HAP17A2 HAP17A3 HAP19A HAP19B HAP19C;
array score {*} score1 score2 score3 score4 score5 score6 score7 score8 score9 score15 score16 score17;
do i=1 to dim(HAA);
 if HAA{i}= 1      then score{i}=1;
  else if HAA{i}= 2 then score{i}=0;
   else                   score{i}=.;
end;drop i;

array HAP {*} HAP18A HAP18B HAP18C HAP18D HAP18E;
array new {*} score10 score11 score12 score13 score14;
do i=1 to dim(HAP);
 if HAP{i} not in (55,88,99,.) then new{i}=1;
  else if HAP{i}=55 then new{i}=0;
end; drop i;

cognition=sum(score1,score2,score3,score4,score5,score6,score7,score8,score9,score10,score11,score12,score13,score14,score15,score16,score17);

/*=================Disease history======================*/
/*History of Diabetes*/
if had1 in (.,8,9)                        then hsDM =.; 
 else if had1=1 and hssex =1               then hsDM=1;
  else if (had1=1 and had4 ^=2) and hssex=2 then hsDM=1; /*exclude women who had gestational DM only*/
	else                                          hsDM=0;
if hsDM=1 or drug_dm=1 or ICD_dm=1 then diabetes=1; else diabetes=0;

/*History of CVD/stroke*/
if HAC1D in (.,8, 9) and HAF10 in (.,8,9) then hsCVD=.; 
 else if HAC1D=1 or HAF10 =1 THEN hsCVD=1; 
  else hsCVD=0;

if hsCVD=1 or drug_cvd=1 or ICD_cvd=1 then CVD=1; else CVD=0;

/*History of hypertension*/
if HAE2=1 then hsHP=1;
 else if HAE2=2 then hsHP=0;
  else hsHP=.;
if hsHP=1 or drug_hp=1 or ICD_hp=1 then HP=1; else HP=0;

/*History of cancer*/
if HAC1N in (.,8,9) and HAC1O in (.,8,9) then hsCancer=.; 
 else if HAC1N=1 or HAC1O =1 then hsCancer=1; 
  else hsCancer=0;
if hsCancer=1 or drug_ca=1 or ICD_ca=1 then Cancer=1; else Cancer=0;

/*History of hypercholesterolemia*/
if HAE7=1 then hsChol=1;
 else if HAE7=2 then hsChol=0;
  else hsChol=0;
if hsChol=1 or drug_hc=1 or ICD_hc=1 then HC=1; else HC=0;

/*=================Biomarker coding======================*/
/*sBP --pepmnk1r;*/
    if PEPMNK1R in (88888,8888, 888) then PEPMNK1R = .;
    SBP = PEPMNK1R;
/*dBP---pepmnk5r;*/
	if PEPMNK5R in (8888, 888) then PEPMNK5R = .;
	DBP=PEPMNK5R;
/*WBC---WCP;*/
	if WCP in (8888, 88888) then WCP=.;
	WBC = WCP;
/*fibrinogen-GBP;*/
	if GBP in (8888, 888) then GBP=.;
/*CRP---CRP;*/
	if CRP in (8888, 88888) then CRP=.;
/*total cholesterol--TCP;*/
	if TCP in (8888, 888) then TCP=.;
/*LDL cholesterol---LCP;*/
	if LCP in (8888, 888) then LCP=.;
/*HDL cholesterol--HDP;*/
	if HDP in (8888, 888) then HDP=.;
/*triglycerides (need fasting sample for this)---TGP + (WTPFHSD6 >0 );*/
	if TGP in (8888, 888) then TGP=.;
/*lipoprotein(a) ---LPP;*/ 
	if LPP in (8888, 888) then LPP=.;

/*=================Dental coding======================*/
/*Annual visits to dentist*/
if haq4 in (.,88,99) then haq4= .; 
if haq4=1 then yvisit=1; 
 else if 1<haq4<6 then yvisit=2; 
  else yvisit=.;
/*periodental disease*/
IF DEPUMPC1 IN (88,99) THEN DEPUMPC1  =.;IF DEPUMPC2 IN (88,99) THEN DEPUMPC2  =.;IF DEPUMPC3 IN (88,99) THEN DEPUMPC3  =.;IF DEPUMPC4 IN (88,99) THEN DEPUMPC4  =.;IF DEPUMPC5 IN (88,99) THEN DEPUMPC5  =.;IF DEPUMPC6 IN (88,99) THEN DEPUMPC6  =.;IF DEPUMPC7 IN (88,99) THEN DEPUMPC7  =.;
IF DEPUBPC1 IN (88,99) THEN DEPUBPC1  =.;IF DEPUBPC2 IN (88,99) THEN DEPUBPC2  =.;IF DEPUBPC3 IN (88,99) THEN DEPUBPC3  =.;IF DEPUBPC4 IN (88,99) THEN DEPUBPC4  =.;IF DEPUBPC5 IN (88,99) THEN DEPUBPC5  =.;IF DEPUBPC6 IN (88,99) THEN DEPUBPC6  =.;IF DEPUBPC7 IN (88,99) THEN DEPUBPC7  =.;
IF DEPlBPC1 IN (88,99) THEN DEPlBPC1  =.;IF DEPlBPC2 IN (88,99) THEN DEPlBPC2  =.;IF DEPlBPC3 IN (88,99) THEN DEPlBPC3  =.;IF DEPlBPC4 IN (88,99) THEN DEPlBPC4  =.;IF DEPlBPC5 IN (88,99) THEN DEPlBPC5  =.;IF DEPlBPC6 IN (88,99) THEN DEPlBPC6  =.;IF DEPlBPC7 IN (88,99) THEN DEPlBPC7  =.;
IF DEPlMPC1 IN (88,99) THEN DEPlMPC1  =.;IF DEPlMPC2 IN (88,99) THEN DEPlMPC2  =.;IF DEPlMPC3 IN (88,99) THEN DEPlMPC3  =.;IF DEPlMPC4 IN (88,99) THEN DEPlMPC4  =.;IF DEPlMPC5 IN (88,99) THEN DEPlMPC5  =.;IF DEPlMPC6 IN (88,99) THEN DEPlMPC6  =.;IF DEPlMPC7 IN (88,99) THEN DEPlMPC7  =.;
IF DEPUMLA1 IN (88,99) THEN DEPUMLA1  =.;IF DEPUMLA2 IN (88,99) THEN DEPUMLA2  =.;IF DEPUMLA3 IN (88,99) THEN DEPUMLA3  =.;IF DEPUMLA4 IN (88,99) THEN DEPUMLA4  =.;IF DEPUMLA5 IN (88,99) THEN DEPUMLA5  =.;IF DEPUMLA6 IN (88,99) THEN DEPUMLA6  =.;IF DEPUMLA7 IN (88,99) THEN DEPUMLA7  =.;
IF DEPUBLA1 IN (88,99) THEN DEPUBLA1  =.;IF DEPUBLA2 IN (88,99) THEN DEPUBLA2  =.;IF DEPUBLA3 IN (88,99) THEN DEPUBLA3  =.;IF DEPUBLA4 IN (88,99) THEN DEPUBLA4  =.;IF DEPUBLA5 IN (88,99) THEN DEPUBLA5  =.;IF DEPUBLA6 IN (88,99) THEN DEPUBLA6  =.;IF DEPUBLA7 IN (88,99) THEN DEPUBLA7  =.;
IF DEPLBLA1 IN (88,99) THEN DEPLBLA1  =.;IF DEPLBLA2 IN (88,99) THEN DEPLBLA2  =.;IF DEPLBLA3 IN (88,99) THEN DEPLBLA3  =.;IF DEPLBLA4 IN (88,99) THEN DEPLBLA4  =.;IF DEPLBLA5 IN (88,99) THEN DEPLBLA5  =.;IF DEPLBLA6 IN (88,99) THEN DEPLBLA6  =.;IF DEPLBLA7 IN (88,99) THEN DEPLBLA7  =.;
IF DEPLMLA1 IN (88,99) THEN DEPLMLA1  =.;IF DEPLMLA2 IN (88,99) THEN DEPLMLA2  =.;IF DEPLMLA3 IN (88,99) THEN DEPLMLA3  =.;IF DEPLMLA4 IN (88,99) THEN DEPLMLA4  =.;IF DEPLMLA5 IN (88,99) THEN DEPLMLA5  =.;IF DEPLMLA6 IN (88,99) THEN DEPLMLA6  =.;IF DEPLMLA7 IN (88,99) THEN DEPLMLA7  =.;

array a {*} DEPUMLA1 DEPUBLA1 DEPUMLA2 DEPUBLA2 DEPUMLA3 DEPUBLA3 DEPUMLA4 DEPUBLA4 DEPUMLA5 DEPUBLA5
			DEPUMLA6 DEPUBLA6 DEPUMLA7 DEPUBLA7 DEPLMLA1 DEPLBLA1 DEPLMLA2 DEPLBLA2 DEPLMLA3 DEPLBLA3 DEPLMLA4 DEPLBLA4 DEPLMLA5
			DEPLBLA5 DEPLMLA6 DEPLBLA6 DEPLMLA7 DEPLBLA7;
array b {*} b1 b2 b3 b4 b5 b6 b7 b8 b9 b10 b11 b12 b13 b14 b15 b16 b17 b18 b19 b20 b21 b22 b23 b24 b25 b26 b27 b28;
do i=1 to dim(a);
	if a{i}=. then b{i}=.; else if a{i}>=3 then b{i}=1; else b{i}=0; 
end;drop i;
alsum=sum (of b1--b28);
array c {*} depumpc1 depubpc1 depumpc2 depubpc2 depumpc3 depubpc3 depumpc4 depubpc4 depumpc5 depubpc5 depumpc6 depubpc6
		   depumpc7 depubpc7 deplmpc1 deplbpc1 deplmpc2 deplbpc2 deplmpc3 deplbpc3 deplmpc4 deplbpc4 deplmpc5 deplbpc5
	       deplmpc6 deplbpc6 deplmpc7 deplbpc7;
array d {*} d1 d2 d3 d4 d5 d6 d7 d8 d9 d10 d11 d12 d13 d14 d15 d16 d17 d18 d19 d20 d21 d22 d23 d24 d25 d26 d27 d28;
do i=1 to dim(c);
	if c{i}=. then d{i}=.; else if c{i}>=5 then d{i}=1; else d{i}=0; 
end;drop i;
pcsum=sum (of d1--d28);
if alsum>=1 OR pcsum>=1 then perioS=1; else perioS=0; 
/*=================check mssing======================*/
array covar {*} Agegp racegp sex edugp incomegp smkgp drinkgp bmigp rpagp hsDM hsCVD hsHP hsCancer hsChol HEI;
ind_missing=0;
do i=1 to dim(covar);
 if covar{i} = . then ind_missing=1;
end; drop i;
/*=================coding end======================*/
	label SEQN='Respondent identification number';
	label Agegp='Age group';
	label racegp='Race';
	label edugp='Education';
	label incomegp='Income:poverty ratio';

	label smkgp='Smoking status';
	label drinkgp='Drinking status';
	label bmigp='Body mass index';
	label WC='Waist Circumference';
	label rpagp='physical activity';
    label hsDM='history of diabetes';
    label hsCVD='history of CVD or stroke';
	label hsHP='history of hypertension';
	label hsCancer='history of cancer';
	label hsChol='history of hypercholesterol';
	label diabetes='Confirmed diabetes';
	label CVD='Confirmed CVD or stroke';
	label HP='Confirmed hypertension';
	label Cancer='Confirmed cancer';
	label HC='Confirmed hypercholesterol';
	label yvisit='Annual visits to dentist';
	label perios = 'Periodontal Destruction';

	label scl_phone='# of phone calls per wk with family/friends/neighbors';
    label scl_together='# of gathering per year with friends/relatives';
    label scl_neighb='# of gathering per year with neighbors';
    label scl_church='# of times per year attending church';
    label scl_clubs='whether belong to any clubs';
    label scl_meeting='# of meetings per year for the clubs';
	label anxiety='Anxiety reported in drugs or ICD codes';
	label depression='Depression reported in drugs or ICD codes';
	label cognition='Total scores for cognition';

	label SBP="Systolic Blood Pressure";
	label DBP="Diastolic Blood Pressure";
	label WBC="WBC";
	label GBP="Fibrinogen";
	label CRP="CRP";
	label TCP="Total Cholesterol";
	label LCP="LDL Cholesterol";
	label HDP="HDL cholesterol";
	label TGP="Triglycerides";
	label LPP="Lipoprotein(a)";
	label HEI="HEI score";
format Agegp Agegp. racegp racegp. sex sex. edugp edugp. incomegp incomegp. smkgp smkgp. drinkgp drinkgp. bmigp bmigp. rpagp rpagp. 
       hsDM yesno. hsCVD yesno. hsHP yesno. hsCancer yesno. hsChol yesno. anxiety yesno. depression yesno.
	   diabetes yesno. CVD yesno. HP yesno. Cancer yesno. HC yesno. drug_hc yesno.
       yvisit yvisit. perios perios. scl_clubs yvisit.;
run;

data allinonenew; set allinone;
array impute {*} drinkgp hsDM hsCVD hsHP hsCancer hsChol anxiety depression diabetes CVD HP Cancer HC;
do i=1 to dim(impute);
 if impute{i}=. then impute{i}=0;
end; drop i;
if smkgp=. then smkgp=1;
if bmigp=. then bmigp=1;
if yvisit=. then yvisit=2;

 if cognition^=.;
 if cognition<=10 then cogn_cat=1;
  else                  cogn_cat=0;
format cogn_cat yesno.;
if scl_clubs=. then scl_clubs=0;
if scl_clubs=2 and scl_meeting=. then scl_meeting=0;
run;
proc freq data=allinonenew; 
 tables hsDM diabetes hsDM*diabetes hsCVD CVD hsCVD*CVD hsHP HP hsHP*HP hsCancer Cancer hsCancer*Cancer hsChol HC hsChol*HC 
        ICD_dm ICD_cvd ICD_hp ICD_ca ICD_hc drug_dm drug_cvd drug_hp drug_ca drug_hc hsChol*drug_hc/norow nocol nopercent;
run;
proc contents data = allinonenew varnum; run;

data dataloc.ADriskfactors220626; set allinonenew;
 keep SEQN SDPPSU6 SDPSTRA6 WTPFQX6 WTPFEX6 WTPFHX6 
      Agegp racegp sex edugp incomegp drinkgp smkgp bmigp rpagp hsDM hsCVD hsHP hsCancer hsChol anxiety depression ind_missing
	  diabetes CVD HP Cancer HC drug_hc
	  scl_phone scl_together scl_neighb scl_church scl_clubs scl_meeting Age bmi WC cognition HEI
      yvisit perios
	  SBP DBP WBC GBP CRP TCP LCP HDP TGP LPP
      drink_wk Age dmppir education
      Red_Green Orange_Red Yellow_Orange Orange_Blue total_clusters cogn_cat;
run;


data dataloc.ADriskfactorsSVM; set allinonenew;
 keep SEQN  
      racegp sex edugp incomegp drinkgp smkgp rpagp hsCVD hsChol anxiety depression
	  diabetes HP Cancer
	  scl_phone scl_together scl_neighb scl_church scl_meeting Age bmi WC HEI
      yvisit perios
	  SBP DBP WBC CRP TCP LCP HDP TGP
      Red_Green Orange_Red Yellow_Orange Orange_Blue total_clusters cogn_cat;
run;


proc format library=dataloc;
	value Agegp 0='<40' 1='40-<65' 2='>=65';
	value racegp 1='white' 2='black' 3='OTHER';
	value sex 1='Male' 2='Female';
	value edugp 1='1_<12 yrs' 2='>=12 yrs';
	value incomegp  1='LOW_PIR=<1.3' 2='MIDDLE_PIR=<3.5' 3='HIGH_PIR>3.5';
	value smkgp 1='NonSmoker' 2='Ever Smoker' 3='Current Smoker';
	value drinkgp 0='NonDrinker' 1='Drinker';
	value bmigp 1='NORMAL' 2='OVERWEIHGT' 3='OBESITY';
	value rpagp 1='Vigorously active' 2='Moderately active' 3='Sedentary'; 
	value yesno 1='YES' 0='NO';
	value yvisit 1='YES' 2='NO';
	value perios 1='Moderate or severe' 0='No disease' ;
run;



ods pdf file="C:\Users\lz7\OneDrive - University of South Carolina\AD risk factors\Summer GA 2022\Results\description0626.pdf";
proc odstext pagebreak=off;
 p "The basic sample size is 4,251."/style={fontstyle=italic};
 p "Variables list:"/style={fontstyle=italic};
 p "Demographic: Age (both continuous and categorical), race, sex, education, income (PIR, both continuous and categorical),
    smoking, drinking, BMI (both continuous and categorical),Waist Circumference (both continuous and categorical), 
    physical activity (both continuous and categorical), disease history of diabetes, CVD, hyperthension, cancer, 
    hypercholesterol,Social connectedness";
 p "Mental health: anxiety, depression, cognition (continuous)";
 p "Periodontal diseases: Annual visits to dentist, Periodontal Destruction";
 p "Serum biomarkers: SBP DBP WBC GBP CRP TCP LCP HDP TGP LPP";
 p "Imputation was done for drinking, history of diabetes, CVD, hypertension, hypercholesterol, anexitey, depression, smoking, BMI group, visit year.";
 p "Clusters: Red_Green Orange_Red Yellow_Orange Orange_Blue total_clusters";
run;
title 'Sample with all covariates';
%con(datain=allinone, conlist=Age bmi WC SBP DBP WBC GBP CRP TCP LCP HDP TGP LPP scl_phone scl_together scl_neighb scl_church scl_meeting 
            cognition Red_Green Orange_Red Yellow_Orange Orange_Blue total_clusters);
%cat(datain=allinone, catlist=Agegp racegp sex edugp incomegp smkgp drinkgp bmigp rpagp hsDM hsCVD hsHP hsCancer hsChol anxiety depression 
            yvisit perios scl_clubs ind_missing);
title 'Sample with imputation';
%cat(datain=allinonenew, catlist=Agegp racegp sex edugp incomegp smkgp drinkgp bmigp rpagp hsDM hsCVD hsHP hsCancer hsChol diabetes CVD HP Cancer HC 
     anxiety depression yvisit perios scl_clubs);
title;
ods pdf close;
